{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "f1a9eb90-335c-4214-8bb6-fd1edbe3ccbd",
   "metadata": {},
   "outputs": [],
   "source": [
    "# My OpenAI Key\n",
    "import os\n",
    "os.environ['OPENAI_API_KEY'] = \"INSERT OPENAI KEY\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "88a9f2e3-c729-455a-a338-2f83776c1d4c",
   "metadata": {},
   "outputs": [],
   "source": [
    "import logging\n",
    "import sys\n",
    "\n",
    "logging.basicConfig(stream=sys.stdout, level=logging.INFO)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "be3f7baa-1c0a-430b-981b-83ddca9e71f2",
   "metadata": {
    "tags": []
   },
   "source": [
    "## Using Knowledge Graph"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "75f1d565-04e8-41bc-9165-166dc89b6b47",
   "metadata": {},
   "source": [
    "#### Building the Knowledge Graph"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "8d0b2364-4806-4656-81e7-3f6e4b910b5b",
   "metadata": {},
   "outputs": [],
   "source": [
    "from llama_index import SimpleDirectoryReader, LLMPredictor, ServiceContext\n",
    "from llama_index.indices.knowledge_graph.base import GPTKnowledgeGraphIndex\n",
    "from langchain import OpenAI\n",
    "from IPython.display import Markdown, display"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "1c297fd3-3424-41d8-9d0d-25fe6310ab62",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "documents = SimpleDirectoryReader('../paul_graham_essay/data').load_data()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "61679142-7595-492b-8792-26cbc439caf8",
   "metadata": {},
   "outputs": [],
   "source": [
    "# define LLM\n",
    "# NOTE: at the time of demo, text-davinci-002 did not have rate-limit errors\n",
    "llm_predictor = LLMPredictor(llm=OpenAI(temperature=0, model_name=\"text-davinci-002\"))\n",
    "service_context = ServiceContext.from_defaults(llm_predictor=llm_predictor, chunk_size_limit=512)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "370fd08f-56ff-4c24-b0c4-c93116a6d482",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# NOTE: can take a while! \n",
    "index = GPTKnowledgeGraphIndex.from_documents(\n",
    "    documents, \n",
    "    max_triplets_per_chunk=2,\n",
    "    service_context=service_context\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "0b4fe9b6-5762-4e86-b51e-aac45d3ecdb1",
   "metadata": {},
   "outputs": [],
   "source": [
    "index.save_to_disk('index_kg.json')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "5eec265d-211b-4f26-b05b-5b4e7072bc6e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# try loading\n",
    "new_index = GPTKnowledgeGraphIndex.load_from_disk('index_kg.json', service_context=service_context)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c39a0eeb-ef16-4982-8ba8-b37c2c5f4437",
   "metadata": {},
   "source": [
    "#### Querying the Knowledge Graph"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "670300d8-d0a8-4201-bbcd-4a74b199fcdd",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:root:> Starting query: Tell me more about Interleaf\n",
      "INFO:root:> Query keywords: ['history', 'company', 'Interleaf', 'software']\n",
      "ERROR:root:Index was not constructed with embeddings, skipping embedding usage...\n",
      "INFO:root:> Extracted relationships: The following are knowledge triplets in the form of (subset, predicate, object):\n",
      "('Interleaf', 'made software for', 'creating documents')\n",
      "('Interleaf', 'added', 'scripting language')\n",
      "('software', 'generate', 'web sites')\n",
      "INFO:root:> Building index from nodes: 0 chunks\n",
      "INFO:root:> [query] Total LLM token usage: 312 tokens\n",
      "INFO:root:> [query] Total embedding token usage: 0 tokens\n"
     ]
    }
   ],
   "source": [
    "response = new_index.query(\n",
    "    \"Tell me more about Interleaf\", \n",
    "    include_text=False, \n",
    "    response_mode=\"tree_summarize\"\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "eecf2d57-3efa-4b0d-941a-95438d42893c",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/markdown": [
       "<b>\n",
       "Interleaf was a software company that made software for creating documents. They later added a scripting language to their software, which allowed users to generate web sites.</b>"
      ],
      "text/plain": [
       "<IPython.core.display.Markdown object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "display(Markdown(f\"<b>{response}</b>\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "bd14686d-1c53-4637-9340-3745f2121ae2",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:root:> Starting query: Tell me more about what the author worked on at Interleaf\n",
      "INFO:root:> Query keywords: ['author', 'Interleaf', 'work']\n",
      "ERROR:root:Index was not constructed with embeddings, skipping embedding usage...\n",
      "INFO:root:> Querying with idx: ed39a830-a116-41b9-a551-bdd348dba61d: life, we aren't consciously aware of much we're seeing. Most visual perceptio...\n",
      "INFO:root:> Querying with idx: fa1cfbb9-782b-4352-b610-cdae080b8f4f: painting that looks like a certain kind of cartoon, you know it's by Roy Lich...\n",
      "INFO:root:> Extracted relationships: The following are knowledge triplets in the form of (subset, predicate, object):\n",
      "('Interleaf', 'made software for', 'creating documents')\n",
      "('Interleaf', 'added', 'scripting language')\n",
      "INFO:root:> Building index from nodes: 0 chunks\n",
      "INFO:root:> [query] Total LLM token usage: 1254 tokens\n",
      "INFO:root:> [query] Total embedding token usage: 0 tokens\n"
     ]
    }
   ],
   "source": [
    "response = new_index.query(\n",
    "    \"Tell me more about what the author worked on at Interleaf\", \n",
    "    include_text=True, \n",
    "    response_mode=\"tree_summarize\"\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "b4c87d14-d2d8-4d80-89f6-1e5972973528",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/markdown": [
       "<b>\n",
       "The author worked on software that allowed users to create documents, similar to Microsoft Word. The software also had a scripting language that was based on Lisp.</b>"
      ],
      "text/plain": [
       "<IPython.core.display.Markdown object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "display(Markdown(f\"<b>{response}</b>\"))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ecc7342a",
   "metadata": {},
   "source": [
    "#### Query with embeddings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "b20f9da1",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:root:> [build_index_from_documents] Total LLM token usage: 24724 tokens\n",
      "INFO:root:> [build_index_from_documents] Total embedding token usage: 1216 tokens\n"
     ]
    }
   ],
   "source": [
    "# NOTE: can take a while! \n",
    "index = GPTKnowledgeGraphIndex.from_documents(\n",
    "    documents, \n",
    "    max_triplets_per_chunk=2,\n",
    "    service_context=service_context,\n",
    "    include_embeddings=True\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "16d71b91",
   "metadata": {},
   "outputs": [],
   "source": [
    "index.save_to_disk('index_kg_embeddings.json')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "c051076e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# try loading\n",
    "new_index = GPTKnowledgeGraphIndex.load_from_disk('index_kg_embeddings.json', service_context=service_context)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "01b74b2a",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:root:> Starting query: Tell me more about what the author worked on at Interleaf\n",
      "INFO:root:> Query keywords: ['author', 'Interleaf', 'work']\n",
      "INFO:root:> Querying with idx: 16514e26-ef09-404a-87ec-71492ffaeafd: that I could write essays again, I wrote a bunch about topics I'd had stacked...\n",
      "INFO:root:> Querying with idx: e9978fb2-df4c-4d20-8c33-1a0d8a6f2ef9: went in 1988 to visit Rich Draves at CMU, where he was in grad school. One da...\n",
      "INFO:root:> Querying with idx: ec114363-5cf9-4204-94ab-b7117e3475c4: me.\n",
      "\n",
      "So I tried to paint, but I just didn't seem to have any energy or ambiti...\n",
      "INFO:root:> Querying with idx: 59b1330b-95d8-41d0-8e89-b020847ccf70: gradually dragged me down. After a few months it felt disconcertingly like wo...\n",
      "INFO:root:> Querying with idx: 7f832bd0-5aed-49f9-80b7-5e39c3fc39e8: \t\t\n",
      "\n",
      "What I Worked On\n",
      "\n",
      "February 2021\n",
      "\n",
      "Before college the two main things I wor...\n",
      "INFO:root:> Querying with idx: dcb05717-1d6f-40a8-8748-ec4ffc75e77b: from writing essays during most of this time, or I'd never have finished. In ...\n",
      "INFO:root:> Querying with idx: f311317c-6794-4b7e-918d-e68c82747a0e: making paintings and living in New York.\n",
      "\n",
      "I was nervous about money, because ...\n",
      "INFO:root:> Querying with idx: 4bbb384a-e989-4d24-8710-22b547d1686e: had not.) So although Robert had his graduate student stipend, I needed that ...\n",
      "INFO:root:> Querying with idx: f045cfbf-5b62-4717-bfe7-b7a6c9c493f3: urls showed that someone had posted it on Slashdot. [10]\n",
      "\n",
      "Wow, I thought, the...\n",
      "INFO:root:> Querying with idx: b314b8fa-0502-40a5-a318-6b2b4b413cbc: taking philosophy courses and they kept being boring. So I decided to switch ...\n",
      "INFO:root:> Extracted relationships: The following are knowledge triplets in the form of (subset, predicate, object):\n",
      "('I', 'wrote', 'essays')\n",
      "('I', 'wrote', 'programs')\n",
      "('I', 'wrote', 'short stories')\n",
      "('Paul Graham', 'worked on', 'Bel')\n",
      "('Interleaf', 'got crushed by', \"Moore's Law\")\n",
      "INFO:root:> Building index from nodes: 1 chunks\n",
      "INFO:root:> [query] Total LLM token usage: 5607 tokens\n",
      "INFO:root:> [query] Total embedding token usage: 12 tokens\n"
     ]
    }
   ],
   "source": [
    "# query using top 3 triplets plus keywords (duplicate triplets are removed)\n",
    "response = new_index.query(\n",
    "    \"Tell me more about what the author worked on at Interleaf\", \n",
    "    include_text=True, \n",
    "    response_mode=\"tree_summarize\",\n",
    "    embedding_mode='hybrid',\n",
    "    similarity_top_k=5\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "02084f6d",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/markdown": [
       "<b>\n",
       "The author worked on software at Interleaf, specifically the Interleaf 6 program. This program was a sort of graphical user interface for Unix that featured windows and a mouse. The author was the lead engineer on the team that ported the program to the Macintosh, which required rewriting a lot of code and designing a new user interface. In addition to this, the author wrote code for the Interleaf 6 Lisp system, which was used to extend the program.</b>"
      ],
      "text/plain": [
       "<IPython.core.display.Markdown object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "display(Markdown(f\"<b>{response}</b>\"))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "cd582500-584c-409a-9963-921738f1beb8",
   "metadata": {},
   "source": [
    "#### Visualizing the Graph"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "id": "b9fe3d26-4f9a-4651-b83f-0018672a34e4",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "\n",
       "        <iframe\n",
       "            width=\"100%\"\n",
       "            height=\"600px\"\n",
       "            src=\"example.html\"\n",
       "            frameborder=\"0\"\n",
       "            allowfullscreen\n",
       "            \n",
       "        ></iframe>\n",
       "        "
      ],
      "text/plain": [
       "<IPython.lib.display.IFrame at 0x127e30c70>"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "## create graph\n",
    "from pyvis.network import Network\n",
    "\n",
    "g = new_index.get_networkx_graph()\n",
    "net = Network(notebook=True, cdn_resources=\"in_line\", directed=True)\n",
    "net.from_nx(g)\n",
    "net.show(\"example.html\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7c2d09ee-22eb-475f-9aed-60f85a2b572f",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "llama_index",
   "language": "python",
   "name": "llama_index"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
